library(dplyr)
library(vroom)
library(here)
library(skimr)
library(tidyverse)
library(tidyquant)
library(mosaic)
library(GGally)
library(ggplot2)
library(readr)
library(leaflet)
library(ggfortify)
library(janitor)
library(broom)
library(car)
library(huxtable)
library(flextable)
library(kableExtra)
library(ggthemes)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(ggcorrplot)

1 Load data

# Read airbnb data
listings <- vroom("http://data.insideairbnb.com/singapore/sg/singapore/2020-06-22/data/listings.csv.gz") %>%
  clean_names()

2 Exploratory Data Analysis

2.1 Looking at the raw values

glimpse(listings)
## Rows: 7,323
## Columns: 106
## $ id                                           <dbl> 49091, 50646, 56334, 716…
## $ listing_url                                  <chr> "https://www.airbnb.com/…
## $ scrape_id                                    <dbl> 2.02e+13, 2.02e+13, 2.02…
## $ last_scraped                                 <date> 2020-06-22, 2020-06-22,…
## $ name                                         <chr> "COZICOMFORT LONG TERM S…
## $ summary                                      <chr> NA, "Fully furnished bed…
## $ space                                        <chr> "This is Room No. 2.(ava…
## $ description                                  <chr> "This is Room No. 2.(ava…
## $ experiences_offered                          <chr> "none", "none", "none", …
## $ neighborhood_overview                        <chr> NA, "The serenity & quie…
## $ notes                                        <chr> NA, "Accommodation has a…
## $ transit                                      <chr> NA, "Less than 400m from…
## $ access                                       <chr> NA, "Kitchen, washing fa…
## $ interaction                                  <chr> NA, "We love to host peo…
## $ house_rules                                  <chr> "No smoking indoors. Ple…
## $ thumbnail_url                                <lgl> NA, NA, NA, NA, NA, NA, …
## $ medium_url                                   <lgl> NA, NA, NA, NA, NA, NA, …
## $ picture_url                                  <chr> "https://a0.muscache.com…
## $ xl_picture_url                               <lgl> NA, NA, NA, NA, NA, NA, …
## $ host_id                                      <dbl> 266763, 227796, 266763, …
## $ host_url                                     <chr> "https://www.airbnb.com/…
## $ host_name                                    <chr> "Francesca", "Sujatha", …
## $ host_since                                   <date> 2010-10-20, 2010-09-08,…
## $ host_location                                <chr> "singapore", "Singapore,…
## $ host_about                                   <chr> "I am a private tutor by…
## $ host_response_time                           <chr> "within an hour", "N/A",…
## $ host_response_rate                           <chr> "100%", "N/A", "100%", "…
## $ host_acceptance_rate                         <chr> "N/A", "N/A", "N/A", "10…
## $ host_is_superhost                            <lgl> FALSE, FALSE, FALSE, FAL…
## $ host_thumbnail_url                           <chr> "https://a0.muscache.com…
## $ host_picture_url                             <chr> "https://a0.muscache.com…
## $ host_neighbourhood                           <chr> "Woodlands", "Bukit Tima…
## $ host_listings_count                          <dbl> 2, 1, 2, 8, 8, 8, 8, 4, …
## $ host_total_listings_count                    <dbl> 2, 1, 2, 8, 8, 8, 8, 4, …
## $ host_verifications                           <chr> "['email', 'phone', 'fac…
## $ host_has_profile_pic                         <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_identity_verified                       <lgl> FALSE, FALSE, FALSE, TRU…
## $ street                                       <chr> "Singapore, Singapore", …
## $ neighbourhood                                <chr> "Woodlands", "Bukit Tima…
## $ neighbourhood_cleansed                       <chr> "Woodlands", "Bukit Tima…
## $ neighbourhood_group_cleansed                 <chr> "North Region", "Central…
## $ city                                         <chr> "Singapore", "Singapore"…
## $ state                                        <chr> NA, NA, NA, NA, NA, NA, …
## $ zipcode                                      <chr> "730702", "589664", NA, …
## $ market                                       <chr> "Singapore", "Singapore"…
## $ smart_location                               <chr> "Singapore", "Singapore"…
## $ country_code                                 <chr> "SG", "SG", "SG", "SG", …
## $ country                                      <chr> "Singapore", "Singapore"…
## $ latitude                                     <dbl> 1.44, 1.33, 1.44, 1.35, …
## $ longitude                                    <dbl> 104, 104, 104, 104, 104,…
## $ is_location_exact                            <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ property_type                                <chr> "Apartment", "Apartment"…
## $ room_type                                    <chr> "Private room", "Private…
## $ accommodates                                 <dbl> 1, 2, 1, 6, 3, 3, 6, 1, …
## $ bathrooms                                    <dbl> 1.0, 1.0, 1.0, 1.0, 0.5,…
## $ bedrooms                                     <dbl> 1, 1, 1, 2, 1, 1, 1, 1, …
## $ beds                                         <dbl> 1, 1, 1, 3, 1, 2, 7, 1, …
## $ bed_type                                     <chr> "Real Bed", "Real Bed", …
## $ amenities                                    <chr> "{TV,\"Cable TV\",Intern…
## $ square_feet                                  <dbl> 0, NA, 0, 205, NA, NA, 4…
## $ price                                        <chr> "$84.00", "$80.00", "$70…
## $ weekly_price                                 <chr> NA, "$400.00", NA, NA, "…
## $ monthly_price                                <chr> "$1,048.00", "$1,600.00"…
## $ security_deposit                             <chr> NA, NA, NA, "$279.00", "…
## $ cleaning_fee                                 <chr> NA, NA, NA, "$56.00", "$…
## $ guests_included                              <dbl> 1, 2, 1, 4, 1, 1, 4, 1, …
## $ extra_people                                 <chr> "$14.00", "$20.00", "$14…
## $ minimum_nights                               <dbl> 180, 90, 6, 90, 90, 90, …
## $ maximum_nights                               <dbl> 360, 730, 14, 1125, 1125…
## $ minimum_minimum_nights                       <dbl> 180, 90, 6, 90, 90, 90, …
## $ maximum_minimum_nights                       <dbl> 180, 90, 6, 90, 90, 90, …
## $ minimum_maximum_nights                       <dbl> 360, 730, 14, 1125, 1125…
## $ maximum_maximum_nights                       <dbl> 360, 730, 14, 1125, 1125…
## $ minimum_nights_avg_ntm                       <dbl> 180, 90, 6, 90, 90, 90, …
## $ maximum_nights_avg_ntm                       <dbl> 360, 730, 14, 1125, 1125…
## $ calendar_updated                             <chr> "73 months ago", "71 mon…
## $ has_availability                             <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ availability_30                              <dbl> 30, 30, 30, 30, 30, 30, …
## $ availability_60                              <dbl> 60, 60, 60, 60, 60, 60, …
## $ availability_90                              <dbl> 90, 90, 90, 90, 90, 90, …
## $ availability_365                             <dbl> 365, 365, 365, 365, 365,…
## $ calendar_last_scraped                        <date> 2020-06-22, 2020-06-22,…
## $ number_of_reviews                            <dbl> 1, 18, 20, 20, 24, 48, 2…
## $ number_of_reviews_ltm                        <dbl> 0, 0, 0, 8, 4, 13, 6, 2,…
## $ first_review                                 <date> 2013-10-21, 2014-04-18,…
## $ last_review                                  <date> 2013-10-21, 2014-12-26,…
## $ review_scores_rating                         <dbl> 94, 91, 98, 89, 83, 88, …
## $ review_scores_accuracy                       <dbl> 10, 9, 10, 9, 8, 9, 9, 1…
## $ review_scores_cleanliness                    <dbl> 10, 10, 10, 8, 8, 9, 8, …
## $ review_scores_checkin                        <dbl> 10, 10, 10, 9, 9, 9, 9, …
## $ review_scores_communication                  <dbl> 10, 10, 10, 10, 9, 9, 9,…
## $ review_scores_location                       <dbl> 8, 9, 8, 9, 8, 9, 9, 10,…
## $ review_scores_value                          <dbl> 8, 9, 9, 9, 8, 9, 8, 10,…
## $ requires_license                             <lgl> FALSE, FALSE, FALSE, FAL…
## $ license                                      <lgl> NA, NA, NA, NA, NA, NA, …
## $ jurisdiction_names                           <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable                             <lgl> FALSE, FALSE, FALSE, TRU…
## $ is_business_travel_ready                     <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy                          <chr> "flexible", "moderate", …
## $ require_guest_profile_picture                <lgl> TRUE, FALSE, TRUE, FALSE…
## $ require_guest_phone_verification             <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ calculated_host_listings_count               <dbl> 2, 1, 2, 8, 8, 8, 8, 3, …
## $ calculated_host_listings_count_entire_homes  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ calculated_host_listings_count_private_rooms <dbl> 2, 1, 2, 8, 8, 8, 8, 3, …
## $ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month                            <dbl> 0.01, 0.24, 0.18, 0.19, …

We can observe that there are 106 variables/columns, with 7,323 observations/rows.

The data we should coerce into different types are as below: 1) price 2) weekly_price 3) monthly_price 4) security_deposit 5) cleaning_fee 6) extra_people

Let’s convert these characters to numeric.

# Transforming characters into numbers
listings <- listings %>%
  mutate(price = parse_number(price)) %>%
  mutate(weekly_price = parse_number(weekly_price)) %>%
  mutate(monthly_price = parse_number(monthly_price)) %>%
  mutate(security_deposit = parse_number(security_deposit)) %>%
  mutate(cleaning_fee = parse_number(cleaning_fee)) %>%
  mutate(extra_people = parse_number(extra_people))

# Check if coercion was successful
typeof(listings$price)
## [1] "double"
typeof(listings$extra_people)
## [1] "double"

2.2 Finding NAs

skim(listings)
Data summary
Name listings
Number of rows 7323
Number of columns 106
_______________________
Column type frequency:
character 41
Date 5
logical 15
numeric 45
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_url 0 1.00 34 37 0 7323 0
name 1 1.00 1 92 0 6766 0
summary 324 0.96 1 1000 0 4367 0
space 1957 0.73 1 1000 0 3139 0
description 249 0.97 1 1000 0 5180 0
experiences_offered 0 1.00 4 4 0 1 0
neighborhood_overview 2933 0.60 3 1000 0 2137 0
notes 3315 0.55 1 1000 0 1634 0
transit 2895 0.60 1 1000 0 2212 0
access 2821 0.61 1 1000 0 2002 0
interaction 3250 0.56 1 1000 0 1665 0
house_rules 3874 0.47 1 1000 0 2059 0
picture_url 0 1.00 81 146 0 6777 0
host_url 0 1.00 39 43 0 2466 0
host_name 22 1.00 1 35 0 1739 0
host_location 41 0.99 2 183 0 217 0
host_about 2441 0.67 1 2858 0 1174 1
host_response_time 22 1.00 3 18 0 5 0
host_response_rate 22 1.00 2 4 0 56 0
host_acceptance_rate 22 1.00 2 4 0 79 0
host_thumbnail_url 22 1.00 55 106 0 2448 0
host_picture_url 22 1.00 57 109 0 2448 0
host_neighbourhood 842 0.89 4 18 0 62 0
host_verifications 0 1.00 2 158 0 187 0
street 0 1.00 13 61 0 93 0
neighbourhood 2 1.00 5 18 0 45 0
neighbourhood_cleansed 0 1.00 4 23 0 43 0
neighbourhood_group_cleansed 0 1.00 11 17 0 5 0
city 64 0.99 1 20 0 39 0
state 6817 0.07 1 39 0 50 0
zipcode 818 0.89 1 9 0 1975 0
market 90 0.99 9 11 0 2 0
smart_location 0 1.00 9 31 0 43 0
country_code 0 1.00 2 2 0 1 0
country 0 1.00 9 9 0 1 0
property_type 0 1.00 3 22 0 26 0
room_type 0 1.00 10 15 0 4 0
bed_type 0 1.00 5 13 0 5 0
amenities 0 1.00 2 977 0 5621 0
calendar_updated 0 1.00 5 13 0 79 0
cancellation_policy 0 1.00 8 27 0 5 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
last_scraped 0 1.00 2020-06-22 2020-06-23 2020-06-22 2
host_since 22 1.00 2009-06-29 2020-06-14 2016-04-09 1576
calendar_last_scraped 0 1.00 2020-06-22 2020-06-23 2020-06-22 2
first_review 2835 0.61 2011-05-04 2020-06-22 2018-08-10 1730
last_review 2835 0.61 2013-10-21 2020-06-22 2019-12-25 1158

Variable type: logical

skim_variable n_missing complete_rate mean count
thumbnail_url 7323 0 NaN :
medium_url 7323 0 NaN :
xl_picture_url 7323 0 NaN :
host_is_superhost 22 1 0.16 FAL: 6143, TRU: 1158
host_has_profile_pic 22 1 1.00 TRU: 7282, FAL: 19
host_identity_verified 22 1 0.22 FAL: 5668, TRU: 1633
is_location_exact 0 1 0.80 TRU: 5845, FAL: 1478
has_availability 0 1 1.00 TRU: 7323
requires_license 0 1 0.00 FAL: 7323
license 7323 0 NaN :
jurisdiction_names 7323 0 NaN :
instant_bookable 0 1 0.42 FAL: 4227, TRU: 3096
is_business_travel_ready 0 1 0.00 FAL: 7323
require_guest_profile_picture 0 1 0.00 FAL: 7289, TRU: 34
require_guest_phone_verification 0 1 0.01 FAL: 7276, TRU: 47

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 2.69e+07 1.22e+07 4.91e+04 1.70e+07 2.90e+07 3.83e+07 4.39e+07 ▂▃▅▆▇
scrape_id 0 1.00 2.02e+13 0.00e+00 2.02e+13 2.02e+13 2.02e+13 2.02e+13 2.02e+13 ▁▁▇▁▁
host_id 0 1.00 1.06e+08 9.58e+07 2.37e+04 2.54e+07 6.64e+07 1.74e+08 3.50e+08 ▇▂▂▂▁
host_listings_count 22 1.00 4.96e+01 9.04e+01 0.00e+00 1.00e+00 9.00e+00 5.10e+01 3.92e+02 ▇▁▁▁▁
host_total_listings_count 22 1.00 4.96e+01 9.04e+01 0.00e+00 1.00e+00 9.00e+00 5.10e+01 3.92e+02 ▇▁▁▁▁
latitude 0 1.00 1.31e+00 3.00e-02 1.24e+00 1.30e+00 1.31e+00 1.32e+00 1.45e+00 ▂▇▂▁▁
longitude 0 1.00 1.04e+02 4.00e-02 1.04e+02 1.04e+02 1.04e+02 1.04e+02 1.04e+02 ▁▁▃▇▁
accommodates 0 1.00 3.25e+00 2.51e+00 1.00e+00 2.00e+00 2.00e+00 4.00e+00 1.60e+01 ▇▂▁▁▁
bathrooms 3 1.00 1.54e+00 1.16e+00 0.00e+00 1.00e+00 1.00e+00 2.00e+00 2.10e+01 ▇▁▁▁▁
bedrooms 12 1.00 1.31e+00 8.70e-01 0.00e+00 1.00e+00 1.00e+00 2.00e+00 1.00e+01 ▇▁▁▁▁
beds 71 0.99 1.98e+00 2.52e+00 0.00e+00 1.00e+00 1.00e+00 2.00e+00 5.80e+01 ▇▁▁▁▁
square_feet 7292 0.00 2.05e+02 2.46e+02 0.00e+00 0.00e+00 2.00e+02 3.18e+02 1.00e+03 ▇▃▁▁▁
price 0 1.00 1.76e+02 4.83e+02 1.40e+01 6.30e+01 1.24e+02 1.90e+02 1.30e+04 ▇▁▁▁▁
weekly_price 6857 0.06 8.36e+02 9.80e+02 1.00e+02 3.99e+02 5.70e+02 1.00e+03 1.47e+04 ▇▁▁▁▁
monthly_price 6826 0.07 2.90e+03 3.51e+03 4.50e+02 1.30e+03 2.10e+03 3.60e+03 5.50e+04 ▇▁▁▁▁
security_deposit 2217 0.70 2.31e+02 5.25e+02 0.00e+00 0.00e+00 1.50e+02 2.50e+02 6.99e+03 ▇▁▁▁▁
cleaning_fee 1947 0.73 3.89e+01 4.29e+01 0.00e+00 1.50e+01 3.00e+01 5.00e+01 9.00e+02 ▇▁▁▁▁
guests_included 0 1.00 1.87e+00 1.60e+00 1.00e+00 1.00e+00 1.00e+00 2.00e+00 5.00e+01 ▇▁▁▁▁
extra_people 0 1.00 1.47e+01 2.16e+01 0.00e+00 0.00e+00 1.00e+01 2.00e+01 4.04e+02 ▇▁▁▁▁
minimum_nights 0 1.00 2.07e+01 4.69e+01 1.00e+00 2.00e+00 4.00e+00 1.80e+01 1.00e+03 ▇▁▁▁▁
maximum_nights 0 1.00 9.27e+02 1.25e+03 1.00e+00 1.12e+03 1.12e+03 1.12e+03 1.00e+05 ▇▁▁▁▁
minimum_minimum_nights 0 1.00 2.01e+01 4.35e+01 1.00e+00 1.00e+00 4.00e+00 1.80e+01 1.00e+03 ▇▁▁▁▁
maximum_minimum_nights 0 1.00 2.07e+01 4.42e+01 1.00e+00 2.00e+00 5.00e+00 1.80e+01 1.00e+03 ▇▁▁▁▁
minimum_maximum_nights 0 1.00 8.81e+05 4.35e+07 1.00e+00 1.12e+03 1.12e+03 1.12e+03 2.15e+09 ▇▁▁▁▁
maximum_maximum_nights 0 1.00 1.17e+06 5.02e+07 1.00e+00 1.12e+03 1.12e+03 1.12e+03 2.15e+09 ▇▁▁▁▁
minimum_nights_avg_ntm 0 1.00 2.03e+01 4.37e+01 1.00e+00 2.00e+00 4.00e+00 1.80e+01 1.00e+03 ▇▁▁▁▁
maximum_nights_avg_ntm 0 1.00 1.17e+06 5.02e+07 1.00e+00 1.12e+03 1.12e+03 1.12e+03 2.15e+09 ▇▁▁▁▁
availability_30 0 1.00 1.58e+01 1.35e+01 0.00e+00 0.00e+00 2.10e+01 2.90e+01 3.00e+01 ▇▁▁▂▇
availability_60 0 1.00 3.43e+01 2.69e+01 0.00e+00 0.00e+00 5.10e+01 5.90e+01 6.00e+01 ▆▁▁▁▇
availability_90 0 1.00 5.33e+01 4.01e+01 0.00e+00 0.00e+00 8.10e+01 8.90e+01 9.00e+01 ▅▁▁▁▇
availability_365 0 1.00 1.95e+02 1.59e+02 0.00e+00 1.00e+00 1.81e+02 3.63e+02 3.65e+02 ▆▂▂▁▇
number_of_reviews 0 1.00 1.25e+01 3.16e+01 0.00e+00 0.00e+00 1.00e+00 8.00e+00 3.70e+02 ▇▁▁▁▁
number_of_reviews_ltm 0 1.00 3.79e+00 9.27e+00 0.00e+00 0.00e+00 0.00e+00 2.00e+00 1.39e+02 ▇▁▁▁▁
review_scores_rating 2969 0.59 9.06e+01 1.25e+01 2.00e+01 8.70e+01 9.40e+01 1.00e+02 1.00e+02 ▁▁▁▂▇
review_scores_accuracy 2974 0.59 9.23e+00 1.28e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_cleanliness 2972 0.59 8.99e+00 1.36e+00 2.00e+00 9.00e+00 9.00e+00 1.00e+01 1.00e+01 ▁▁▁▂▇
review_scores_checkin 2978 0.59 9.51e+00 1.07e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_communication 2974 0.59 9.49e+00 1.11e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_location 2979 0.59 9.38e+00 1.04e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_value 2978 0.59 8.96e+00 1.27e+00 2.00e+00 9.00e+00 9.00e+00 1.00e+01 1.00e+01 ▁▁▁▂▇
calculated_host_listings_count 0 1.00 4.37e+01 7.52e+01 1.00e+00 2.00e+00 1.00e+01 5.20e+01 3.35e+02 ▇▂▁▁▁
calculated_host_listings_count_entire_homes 0 1.00 3.18e+01 7.26e+01 0.00e+00 0.00e+00 1.00e+00 2.30e+01 3.30e+02 ▇▁▁▁▁
calculated_host_listings_count_private_rooms 0 1.00 8.90e+00 2.47e+01 0.00e+00 0.00e+00 1.00e+00 4.00e+00 1.36e+02 ▇▁▁▁▁
calculated_host_listings_count_shared_rooms 0 1.00 4.50e-01 2.11e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 2.00e+01 ▇▁▁▁▁
reviews_per_month 2835 0.61 7.60e-01 1.33e+00 1.00e-02 1.10e-01 3.00e-01 8.70e-01 2.87e+01 ▇▁▁▁▁

Let’s change NAs to meaningful entries. For example missing security_deposit or cleaning_fee can very well mean they are zero. Both of their completion rates are about 70%.

# Handling missing values
listings <- listings %>%
  mutate(cleaning_fee = case_when(
    is.na(cleaning_fee) ~ 0, 
    TRUE ~ cleaning_fee
  )) %>%
  mutate(security_deposit = case_when(
    is.na(security_deposit) ~ 0, 
    TRUE ~ security_deposit
  ))

Count distinct property types

# Count and calculate proportion
type <- listings %>%
  count(property_type) %>%
  arrange(desc(n)) %>%
  mutate(proportion = 100 * n/sum(n))
type %>% 
  kable()
property_type n proportion
Apartment 2882 39.355
Condominium 2136 29.168
Serviced apartment 657 8.972
House 501 6.841
Hostel 319 4.356
Hotel 170 2.321
Boutique hotel 155 2.117
Townhouse 112 1.529
Loft 105 1.434
Bed and breakfast 76 1.038
Bungalow 66 0.901
Other 57 0.778
Guest suite 18 0.246
Guesthouse 17 0.232
Villa 16 0.218
Aparthotel 10 0.137
Boat 7 0.096
Tent 6 0.082
Campsite 3 0.041
Chalet 3 0.041
Tiny house 2 0.027
Bus 1 0.014
Cabin 1 0.014
Earth house 1 0.014
Heritage hotel (India) 1 0.014
Igloo 1 0.014
# What % does the top 4 type account for?
top_4_prop <- sum(head(type$proportion, 4))
top_4_prop
## [1] 84.3

We need to add an extra column prop_type_simplified with just 5 categories.

# Create an extra column with simplified 
listings <- listings %>%
  mutate(prop_type_simplified = case_when(
    property_type %in% c("Apartment","Condominium", "Serviced apartment","House") ~ property_type, 
    TRUE ~ "Other"
  ))

After the above operation, a skim should find out that prop_type_simplified has only 5 categories, much more simplified compared to property_type having 26 categories. The check below (provided by the project instruction website) also works.

# Check if prop_type_simplified is correctly created
listings %>%
  count(prop_type_simplified) %>%
  arrange(desc(n)) %>% 
  kable()
prop_type_simplified n
Apartment 2882
Condominium 2136
Other 1147
Serviced apartment 657
House 501

As we are only concerned with listings for travel purpose. We filter data such that we only have observations with minimum_nights <= 4

# Filter to obtain listings for travel purpose
listings1 <- listings %>% 
  filter(minimum_nights <= 4)

# Check if we successfully filtered
fav_stats(listings1$minimum_nights)
minQ1medianQ3maxmeansdnmissing
112341.870.94637130

2.3 Make preliminary conjecture and basic plots

# Does guest_included push up price?
p_v_guest <- listings1 %>%
  ggplot(mapping = aes(x = guests_included, y = price)) +
  geom_point(alpha = 0.5, colour = "red") +
  xlab("guests included")
  
p_v_guest

p_v_cleaning <- listings1 %>%
ggplot(listings1, mapping = aes(x = cleaning_fee, y = price)) +
  geom_point(alpha = 0.5, colour = "blue") +
  xlab("cleaning fee")
p_v_cleaning

It looks like both graphs look too clustered near the origin point. There are also outliers on x-axes with orders of magnitudes higher than points near origin. Let’s apply log() to both axes and see how.

p_v_guest_log <- listings1 %>%
  ggplot(mapping = aes(x = log(guests_included), y = log(price))) +
  geom_point(alpha = 0.5, colour = "red") +
  geom_smooth() +
  xlab("log(guests included)")
p_v_guest_log

p_v_cleaning_log <- listings1 %>%
ggplot(listings1, mapping = aes(x = log(cleaning_fee), y = log(price))) +
  geom_point(alpha = 0.5, colour = "blue") +
  geom_smooth() +
  xlab("log(cleaning fee)")
p_v_cleaning_log

Deduction: It is interesting to see that there is a positive correlation in both cases, meaning certain proportional increases in guests included/cleaning fee can be associated with proportional increases in prices.

3 Visualize locations of the listings in Singapore

leaflet(data = listings1) %>% 
  addProviderTiles("OpenStreetMap.Mapnik") %>% 
  addCircleMarkers(lng = ~longitude, 
                   lat = ~latitude, 
                   radius = 1, 
                   fillColor = "blue", 
                   fillOpacity = 0.4, 
                   popup = ~listing_url,
                   label = ~property_type)

To get a sense of the zoning and the locations of each simplified property type, we change leaflet operations a bit as follows:

# See the unique levels of prop_type_simplified and neighbourhood_group_cleansed
unique(listings1$neighbourhood_group_cleansed)
## [1] "East Region"       "North-East Region" "West Region"      
## [4] "Central Region"    "North Region"
unique(listings1$prop_type_simplified)
## [1] "House"              "Condominium"        "Apartment"         
## [4] "Other"              "Serviced apartment"
# Count each unique levels under prop_type_simplified
n_c_count <- listings1 %>%
  count(neighbourhood_group_cleansed) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))
n_c_count
neighbourhood_group_cleansedntotalproportion
Central Region2868371377.2 
East Region29337137.89
West Region28637137.7 
North-East Region16637134.47
North Region10037132.69
# Create a palette that maps neighbourhood_group_cleansed factor levels to colors
pal1 <- colorFactor(c("navy", "red", "yellow", "orange", "green"), 
                    domain = c("Central Region", "East Region", "West Region","North-East Region", "North Region" ))

# Apply the palette to actual data
leaflet(data = listings1) %>% 
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude, 
    lat = ~latitude, 
    color = ~pal1(neighbourhood_group_cleansed),
    radius = 3,
    stroke = FALSE, 
    fillOpacity = 0.5)

We speculate that most AirBnb customers could be tourists of other countries rather than locals. From the cluster mapping and count table, Central Region is most concentrated in listings, consistent with the intuition that Central Region has some of the hottest sightseeing (or meeting spots for businessmen). East Region is close to Changi Airport, a top international transfer choice and one of the busiest airport in the world. The associated demand could impact price. Zoning can well be an independent variable that contributes to pricing.

# Count each unique levels under prop_type_simplified
s_p_count <- listings1 %>%
  count(prop_type_simplified) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))
s_p_count
prop_type_simplifiedntotalproportion
Apartment1448371339   
Other953371325.7 
Condominium803371321.6 
House31037138.35
Serviced apartment19937135.36
# Create a palette that maps prop_type_simplified factor levels to colors
pal2 <- colorFactor(c("navy", "red", "yellow", "black", "green"), 
                    domain = c("Apartment", "Other", "Condominium", "House", "Serviced apartment"))

# Apply the palette to actual data
leaflet(data = listings1) %>% 
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude, 
    lat = ~latitude, 
    color = ~pal2(prop_type_simplified),
    radius = 3,
    stroke = FALSE, 
    fillOpacity = 0.5)

From both count and mapping, we can see that Apartment is the most widely offered property type. Given its geographical span, it could well mean prices are quite dispersed within the category. House and Serivced aparment are most clustered in the Central Region. We speculate that the ‘prop_type_simplified’ could enhance a likely high price that comes with its region identity.

4 Regression Analysis

4.1 Decide on which to use: price_4_nights or price_4_nights_log

# Create a new variable called price_4_nights
extra_listings <- listings1 %>%
  filter(maximum_nights >= 4) %>%   # The listing must accept at least 4 nights of stay
  mutate(price_4_nights = 
           ifelse(guests_included <= 1, # If only meant to accommodate 1 guest
                  (price + extra_people) * 4 + cleaning_fee,  # Need extra_people charge 
                  (price) * 4 + cleaning_fee)) # Price if can accommodate 2 or more guests

# See key stats of price for 2 people 4 nights in Singapore
fav_stats(extra_listings$price_4_nights)
minQ1medianQ3maxmeansdnmissing
562804948155.2e+048142.5e+0336540
density.default(extra_listings$price_4_nights)
## 
## Call:
##  density.default(x = extra_listings$price_4_nights)
## 
## Data: extra_listings$price_4_nights (3654 obs.); Bandwidth 'bw' = 69.65
## 
##        x               y           
##  Min.   : -153   Min.   :0.000000  
##  1st Qu.:12938   1st Qu.:0.000000  
##  Median :26028   Median :0.000000  
##  Mean   :26028   Mean   :0.000019  
##  3rd Qu.:39118   3rd Qu.:0.000000  
##  Max.   :52209   Max.   :0.001361
ggplot(extra_listings,
  aes(x = price_4_nights)) +
  geom_density()+
  theme_economist() +
  xlab("price for 2 people staying 4 nights")

ggplot(extra_listings, aes(x = log(price_4_nights))) +
  geom_density()+
  theme_economist() +
  xlab("log(price)")

  NULL
## NULL

Looking at our data, we believe that using the log(price_4_nights) would be a better variable to use for our regression model. This is so that a change in dependent variable could mean a multiplied % change in price_4_nights, rather than multiplied increments of coefficients. Otherwise, for data with price_4_nights several orders of magnitude larger, a model would need a disproportionate increase in independent variables to conform to a dependent variable.

# Create a variable of log price data
extra_listings_log <- extra_listings %>%
  mutate(price_4_nights_log = log(price_4_nights)) 

#ggpairs(extra_listings_log$["price_4_nights_log","prop_type_simplified","number_of_reviews","review_scores_rating"])
#ggpairs(extra_listings_log[,c("price_4_nights_log","prop_type_simplified","number_of_reviews","review_scores_rating")])

4.2 Speculate which numeric variables could signficantly impact prices

We need to see what are some possible numeric variables that could impact prices. Due to the display constraint of a correlation matrix in a R studio window. We divide the numeric variables into two groups. Also, we sieve out variables that we think are logically irrelevant to prices, such as id, minimum_nights, listings_count, availability, longitude, latitude.

# Create a dataframe for group 1 of numeric variables
matrix_data1 <- extra_listings_log %>%
  select_if(is.numeric) %>%
  select(!ends_with("id")) %>%
  select(!contains("minimum_nights")) %>%
  select(!contains("maximum_nights")) %>%
  select(!contains("listings_count")) %>%
  select(!starts_with("availability")) %>%
  select(!contains("latitude")) %>%
  select(!contains("longitude")) %>%
  select(accommodates:guests_included, price_4_nights_log) %>%
  select(-price)

# Visualize correlation matrix for group 1 of numeric variables
corr1 <- cor(matrix_data1)

ggcorrplot(corr1, 
           lab = TRUE)

# Create a ggpair matrix for group 1 of numeric variables
ggpairs(matrix_data1)

# Create a dataframe for group 2 of numeric variables
matrix_data2 <- extra_listings_log %>%
  select_if(is.numeric) %>%
  select(!ends_with("id")) %>%
  select(!contains("minimum_nights")) %>%
  select(!contains("maximum_nights")) %>%
  select(!contains("listings_count")) %>%
  select(!starts_with("availability")) %>%
  select(!contains("latitude")) %>%
  select(!contains("longitude")) %>%
  select(extra_people:price_4_nights_log) %>%
  select(-price_4_nights)

# Visualize correlation matrix for group 1 of numeric variables
corr2 <- cor(matrix_data2)

ggcorrplot(corr2, 
           lab = TRUE)

# Create a ggpair matrix for group 1 of numeric variables
ggpairs(matrix_data2)

Unfortunately, the correlation coefficients observed are relatively small. But this does not put us off because it confirms the need to incorporate other logical or character variables in predicting price_4_nights_log. Variables having a correlation coefficients greater or equal to 0.1 with price_4_nights_log include bedrooms, square_feet, guests_included, cleaning_fee, security_deposit, extra_people, review_scores_cleanliness. Although these will be numeric variables to closely watch later in models, except bedrooms, they are the variables not normally considered before booking an Airbnb. They can be theoretically useful, but may not apply in real life for predictions.

4.3 Model 1

# Model 1, includes simplified property type, number of reviews, review ratings
model1 <- lm(price_4_nights_log ~
             prop_type_simplified +
             number_of_reviews +
             review_scores_rating,
             data = extra_listings_log)

# Check if there are clear differences between prices of different property types
favstats(~price_4_nights | prop_type_simplified, data = extra_listings_log)
prop_type_simplifiedminQ1medianQ3maxmeansdnmissing
Apartment56272422756       4.01e+04700       1.67e+0314260
Condominium60356628915       4.02e+04855       2.17e+037930
House131250400586       3.56e+04775       2.89e+033040
Other60236404759       1.18e+04630       805       9340
Serviced apartment3486329471.28e+035.2e+04 2.41e+037.65e+031970
# Display table of coefficients
msummary(model1)
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             5.746432   0.101091   56.84  < 2e-16
## prop_type_simplifiedCondominium         0.192579   0.037372    5.15  2.8e-07
## prop_type_simplifiedHouse              -0.217104   0.051998   -4.18  3.1e-05
## prop_type_simplifiedOther              -0.146498   0.034495   -4.25  2.2e-05
## prop_type_simplifiedServiced apartment  0.605880   0.067233    9.01  < 2e-16
## number_of_reviews                      -0.000999   0.000323   -3.10    0.002
## review_scores_rating                    0.004810   0.001109    4.34  1.5e-05
##                                           
## (Intercept)                            ***
## prop_type_simplifiedCondominium        ***
## prop_type_simplifiedHouse              ***
## prop_type_simplifiedOther              ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews                      ** 
## review_scores_rating                   ***
## 
## Residual standard error: 0.678 on 2426 degrees of freedom
##   (1221 observations deleted due to missingness)
## Multiple R-squared:  0.0829, Adjusted R-squared:  0.0806 
## F-statistic: 36.6 on 6 and 2426 DF,  p-value: <2e-16
tab_model(model1)
  price 4 nights log
Predictors Estimates CI p
(Intercept) 5.75 5.55 – 5.94 <0.001
prop_type_simplified
[Condominium]
0.19 0.12 – 0.27 <0.001
prop_type_simplified
[House]
-0.22 -0.32 – -0.12 <0.001
prop_type_simplified
[Other]
-0.15 -0.21 – -0.08 <0.001
prop_type_simplified
[Serviced apartment]
0.61 0.47 – 0.74 <0.001
number_of_reviews -0.00 -0.00 – -0.00 0.002
review_scores_rating 0.00 0.00 – 0.01 <0.001
Observations 2433
R2 / R2 adjusted 0.083 / 0.081
# Check the residuals
autoplot(model1)

vif(model1) %>% #calculate variance-inflation and generalized variance-inflation factors
  kable() %>%  #table type in HTML
    kable_styling(bootstrap_options = "striped", full_width = F, position = "left") #styling of table
GVIF Df GVIF^(1/(2*Df))
prop_type_simplified 1.02 4 1.00
number_of_reviews 1.02 1 1.01
review_scores_rating 1.02 1 1.01

4.3.1 Model 1 Analysis

Interpret the coefficient review_scores_rating in terms of price_4_nights: On average for 2 listings that have a difference of 1 in their review_scores_rating, the listing with the higher review will have a higher price_4_nights_log of 0.0048099. This means a higher price_4_nights of of 0.48% (or \(e^{0.0048099}-1\)). The t-value is 4.338 > 2, and p value = 1.49e-05 < 0.05, these mean the coefficient is statistically significant at a 95% confidence level.

Interpret the coefficient prop_type_simplified in terms of price_4_nights: The regression model uses Apartment as the base level for prop_type_simplified and the coefficients of the property types can be stated as: On average, a Condominium will have a higher price_4_nights_log of 0.1925787 as compared to an Apartment. This means a higher price of 21.23% (or \(e^{0.1925787}-1\)) than an Apartment. On average, a House will have a lower price of 19.51% (or \(e^{-0.2171037}-1\)) than an Apartment. On average, a property type of Other will have a lower price of -13.63% (or \(e^{-0.1464977}-1\)) than an Apartment. On average, a Serviced apartment will have a higher price of 83.29% (or \(e^{0.6058804}-1\)) than an Apartment. All of the absolute values of coefficients’ t-value are greater than 2, and p value < 0.05, meaning the coefficients are statistically significant at a 95% confidence level.

The adjusted r square for this model is 0.08064 or 8.1%. This means that the model with these 3 variables account for only 8.1% of the variation in the price_4_nights_log.

4.4 Model 2

#Model 2, includes property type, number of reviews, reviews ratings and room type
model2 <- lm(price_4_nights_log ~
             prop_type_simplified +
             number_of_reviews +
             review_scores_rating +
             room_type, 
             data = extra_listings_log)

# Display table of coefficients
summary(model2)
## 
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews + 
##     review_scores_rating + room_type, data = extra_listings_log)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.660 -0.351 -0.050  0.294  4.720 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             6.298363   0.083995   74.99  < 2e-16
## prop_type_simplifiedCondominium         0.044420   0.030806    1.44     0.15
## prop_type_simplifiedHouse              -0.002812   0.043087   -0.07     0.95
## prop_type_simplifiedOther               0.284729   0.033598    8.47  < 2e-16
## prop_type_simplifiedServiced apartment  0.675706   0.058336   11.58  < 2e-16
## number_of_reviews                      -0.001409   0.000264   -5.34  1.0e-07
## review_scores_rating                    0.003885   0.000908    4.28  1.9e-05
## room_typeHotel room                    -0.980829   0.047213  -20.77  < 2e-16
## room_typePrivate room                  -0.806641   0.026242  -30.74  < 2e-16
## room_typeShared room                   -1.425375   0.057221  -24.91  < 2e-16
##                                           
## (Intercept)                            ***
## prop_type_simplifiedCondominium           
## prop_type_simplifiedHouse                 
## prop_type_simplifiedOther              ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews                      ***
## review_scores_rating                   ***
## room_typeHotel room                    ***
## room_typePrivate room                  ***
## room_typeShared room                   ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.553 on 2423 degrees of freedom
##   (1221 observations deleted due to missingness)
## Multiple R-squared:  0.391,  Adjusted R-squared:  0.388 
## F-statistic:  173 on 9 and 2423 DF,  p-value: <2e-16
# Count different room types
extra_listings_log %>%
  count(room_type) %>%
  arrange(desc(n)) %>% 
  kable()
room_type n
Private room 1754
Entire home/apt 1336
Hotel room 333
Shared room 231
# Check the residuals
autoplot(model2)

### Model 2 Analysis

Model 2 includes the same variables as model 1 (prop_type_simplified, number_of_reviews, review_scores_rating) with a new added variable of room_type. As shown from the table above there are 4 main room types (Private room, Entire home/apt, Hotel room, Shared room). Model 2 uses the Entire home/apt as a base to assess the impact of the other room types on price.

From the model we can see that all 3 other room types have a negative relationship to an Entire home/apt. This makes sense as it would be more expensive to rent out an entire place as compared to just a single Private room or Shared room. A Shared room has the biggest negative relationship as compared to other room types which again makes sense as we expect a Shared room to be the cheapest as compared to the other more private options. The price of a Shared room is 75.96% (or \(e^{-1.4253747}-1\)) cheaper than the price of an Entire home/apt. The price of a Hotel room and Private room are 62.50% (or \(e^{-0.9808295}-1\)) and 55.36% (\(e^{-0.8066407}-1\) cheaper as compared to the price of an Entire home/apt. All of the absolute values of coefficients’ t-value are greater than 2, and p value < 0.05, meaning the coefficients are statistically significant at a 95% confidence level.

The adjusted r square for this model is 0.3883 or 38.8%. This means that model 2 accounts for 38.8% of the variation in the price_4_nights_log. This also shows that the room_type is a good variable to explain the variation as the r square increased by 30.7% from model 1 to model 2.

5 Further Variables to Explore

5.1 Model 3

At this stage, when we think of if bedrooms, bathrooms, beds can impact price_4_nights_log, we realize that only when room_type is Entire home/apt, these 3 variables truly matter. So we first employ filter and count functions to see if the Entire home/apt and other room types differ significantly for these 3 variables.

entire_count_bedrooms <- extra_listings_log %>%
  filter(room_type == "Entire home/apt") %>%
  count(bedrooms) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

entire_count_bedrooms
bedroomsntotalproportion
2441133633    
1427133632    
3227133617    
0163133612.2  
44713363.52 
51513361.12 
6613360.449
10513360.374
313360.225
7213360.15 
entire_count_bathrooms <- extra_listings_log %>%
  filter(room_type == "Entire home/apt") %>%
  count(bathrooms) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

entire_count_bathrooms
bathroomsntotalproportion
1  670133650.1   
2  391133629.3   
3  12413369.28  
1.56813365.09  
2.53513362.62  
3.51513361.12  
4  913360.674 
6  713360.524 
0  613360.449 
5  513360.374 
5.5213360.15  
4.5113360.0749
7  113360.0749
8  113360.0749
  113360.0749
entire_count_beds <- extra_listings_log %>%
  filter(room_type == "Entire home/apt") %>%
  count(beds) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

entire_count_beds
bedsntotalproportion
1428133632     
2369133627.6   
3235133617.6   
4143133610.7   
56113364.57  
02913362.17  
62813362.1   
813360.599 
8713360.524 
7613360.449 
10613360.449 
16613360.449 
9313360.225 
14313360.225 
11213360.15  
13113360.0749
23113360.0749
private_count_bedrooms <- extra_listings_log %>%
  filter(room_type == "Private room") %>%
  count(bedrooms) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

private_count_bedrooms
bedroomsntotalproportion
11553175488.5  
09917545.64 
26617543.76 
32017541.14 
717540.399
4417540.228
6317540.171
5217540.114
private_count_bathrooms <- extra_listings_log %>%
  filter(room_type == "Private room") %>%
  count(bathrooms) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

private_count_bathrooms
bathroomsntotalproportion
1  1309175474.6  
2  14517548.27 
1.511517546.56 
0  4417542.51 
0.53417541.94 
2.52817541.6  
3  2517541.43 
4  2117541.2  
6  917540.513
8  817540.456
5  617540.342
12  417540.228
7  317540.171
9  217540.114
  117540.057
private_count_beds <- extra_listings_log %>%
  filter(room_type == "Private room") %>%
  count(beds) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

private_count_beds
bedsntotalproportion
11209175468.9  
2340175419.4  
34917542.79 
04617542.62 
43917542.22 
2617541.48 
61417540.798
81017540.57 
7717540.399
5617540.342
10417540.228
14217540.114
16217540.114
hotel_count_bedrooms <- extra_listings_log %>%
  filter(room_type == "Hotel room") %>%
  count(bedrooms) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

hotel_count_bedrooms
bedroomsntotalproportion
126333379   
03633310.8 
2223336.61
393332.7 
413330.3 
713330.3 
813330.3 
hotel_count_bathrooms <- extra_listings_log %>%
  filter(room_type == "Hotel room") %>%
  count(bathrooms) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

hotel_count_bathrooms
bathroomsntotalproportion
1  12833338.4  
3  5533316.5  
4  4333312.9  
2  3633310.8  
6  173335.11 
0  113333.3  
5  83332.4  
1.573332.1  
8.573332.1  
10  53331.5  
16  43331.2  
8  33330.901
13  33330.901
0.523330.601
2.513330.3  
7  13330.3  
9  13330.3  
12.513330.3  
hotel_count_beds <- extra_listings_log %>%
  filter(room_type == "Hotel room") %>%
  count(beds) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

hotel_count_beds
bedsntotalproportion
115733347.1  
25233315.6  
10203336.01 
8133333.9  
3123333.6  
6113333.3  
16113333.3  
0103333    
493332.7  
1493332.7  
3093332.7  
1263331.8  
1863331.8  
1323330.601
513330.3  
713330.3  
913330.3  
3613330.3  
5813330.3  
13330.3  
shared_count_bedrooms <- extra_listings_log %>%
  filter(room_type == "Shared room") %>%
  count(bedrooms) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

shared_count_bedrooms
bedroomsntotalproportion
1231231100
shared_count_bathrooms <- extra_listings_log %>%
  filter(room_type == "Shared room") %>%
  count(bathrooms) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

shared_count_bathrooms
bathroomsntotalproportion
1  7723133.3  
0  3323114.3  
3  3123113.4  
2  232319.96 
0.5112314.76 
4  112314.76 
5  82313.46 
1.572313.03 
2.572313.03 
7  72313.03 
6  62312.6  
8  42311.73 
9  22310.866
21  22310.866
7.512310.433
10  12310.433
shared_count_beds <- extra_listings_log %>%
  filter(room_type == "Shared room") %>%
  count(beds) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

shared_count_beds
bedsntotalproportion
111723150.6  
22523110.8  
4162316.93 
6152316.49 
16102314.33 
892313.9  
1072313.03 
3252312.16 
52312.16 
1242311.73 
1442311.73 
332311.3  
022310.866
522310.866
922310.866
2022310.866
712310.433
2812310.433
4612310.433
room_type_count <- extra_listings_log %>%
  count(room_type) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))

room_type_count
room_typentotalproportion
Private room1754365448   
Entire home/apt1336365436.6 
Hotel room33336549.11
Shared room23136546.32

The above count and filter operations show that for Entire home/apt, bedrooms, beds, bathrooms counts differ significantly within itself. (And Entire home/apt accounts for 36.56% of data set, significant!) Within other types, the 3 variables will have much more significant counts for “1” over other numbers. Therefore, it is worth considering to downplay the importance ofbedrooms, beds, bathrooms, in predicting the price_4_nights_log for Private room, Hotel room, Shared room. For now, we will still handle the 4 room_types under the same model.

Do we want to build a separate model for `Entire home/apt??? probably at the end in conclusion? to show our critical thinking

# Model 3, includes property type, number of reviews, reviews ratings,room type, bedrooms, bathrooms and beds
model3 <-lm(price_4_nights_log ~
            prop_type_simplified +
            number_of_reviews +
            review_scores_rating +
            room_type + 
             bathrooms +
             bedrooms +
             beds,
            data = extra_listings_log)

# Display coefficients
summary(model3)
## 
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews + 
##     review_scores_rating + room_type + bathrooms + bedrooms + 
##     beds, data = extra_listings_log)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.048 -0.296 -0.042  0.271  4.713 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             5.704932   0.084485   67.53  < 2e-16
## prop_type_simplifiedCondominium         0.026866   0.028600    0.94    0.348
## prop_type_simplifiedHouse              -0.019362   0.039933   -0.48    0.628
## prop_type_simplifiedOther               0.317102   0.031962    9.92  < 2e-16
## prop_type_simplifiedServiced apartment  0.660393   0.054394   12.14  < 2e-16
## number_of_reviews                      -0.001273   0.000245   -5.20  2.1e-07
## review_scores_rating                    0.005828   0.000854    6.82  1.1e-11
## room_typeHotel room                    -0.780591   0.046247  -16.88  < 2e-16
## room_typePrivate room                  -0.647655   0.025728  -25.17  < 2e-16
## room_typeShared room                   -1.233408   0.054809  -22.50  < 2e-16
## bathrooms                              -0.018002   0.007902   -2.28    0.023
## bedrooms                                0.272009   0.014032   19.38  < 2e-16
## beds                                   -0.003766   0.003554   -1.06    0.290
##                                           
## (Intercept)                            ***
## prop_type_simplifiedCondominium           
## prop_type_simplifiedHouse                 
## prop_type_simplifiedOther              ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews                      ***
## review_scores_rating                   ***
## room_typeHotel room                    ***
## room_typePrivate room                  ***
## room_typeShared room                   ***
## bathrooms                              *  
## bedrooms                               ***
## beds                                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.511 on 2404 degrees of freedom
##   (1237 observations deleted due to missingness)
## Multiple R-squared:  0.48,   Adjusted R-squared:  0.477 
## F-statistic:  185 on 12 and 2404 DF,  p-value: <2e-16
# Calculate Variance Inflation Factor
car::vif(model3)
##                      GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.96  4            1.09
## number_of_reviews    1.03  1            1.01
## review_scores_rating 1.05  1            1.02
## room_type            2.30  3            1.15
## bathrooms            1.39  1            1.18
## bedrooms             1.40  1            1.18
## beds                 1.42  1            1.19
# Check the residuals
autoplot(model3)

model3_1 <-lm(price_4_nights_log ~
            prop_type_simplified +
            number_of_reviews +
            review_scores_rating +
            room_type + 
             bathrooms +
             bedrooms,
            data = extra_listings_log)

# Display coefficients
summary(model3_1)
## 
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews + 
##     review_scores_rating + room_type + bathrooms + bedrooms, 
##     data = extra_listings_log)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.045 -0.298 -0.042  0.271  4.710 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             5.702491   0.083941   67.93  < 2e-16
## prop_type_simplifiedCondominium         0.029510   0.028584    1.03   0.3020
## prop_type_simplifiedHouse              -0.020137   0.039906   -0.50   0.6139
## prop_type_simplifiedOther               0.322511   0.031664   10.19  < 2e-16
## prop_type_simplifiedServiced apartment  0.669639   0.054221   12.35  < 2e-16
## number_of_reviews                      -0.001302   0.000244   -5.33  1.1e-07
## review_scores_rating                    0.005844   0.000847    6.90  6.8e-12
## room_typeHotel room                    -0.793785   0.045507  -17.44  < 2e-16
## room_typePrivate room                  -0.644492   0.025695  -25.08  < 2e-16
## room_typeShared room                   -1.239739   0.054107  -22.91  < 2e-16
## bathrooms                              -0.020562   0.007727   -2.66   0.0078
## bedrooms                                0.268517   0.013545   19.82  < 2e-16
##                                           
## (Intercept)                            ***
## prop_type_simplifiedCondominium           
## prop_type_simplifiedHouse                 
## prop_type_simplifiedOther              ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews                      ***
## review_scores_rating                   ***
## room_typeHotel room                    ***
## room_typePrivate room                  ***
## room_typeShared room                   ***
## bathrooms                              ** 
## bedrooms                               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.512 on 2417 degrees of freedom
##   (1225 observations deleted due to missingness)
## Multiple R-squared:  0.478,  Adjusted R-squared:  0.476 
## F-statistic:  202 on 11 and 2417 DF,  p-value: <2e-16
# Calculate Variance Inflation Factor
car::vif(model3_1)
##                      GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.90  4            1.08
## number_of_reviews    1.02  1            1.01
## review_scores_rating 1.04  1            1.02
## room_type            2.16  3            1.14
## bathrooms            1.33  1            1.15
## bedrooms             1.30  1            1.14
# Check the residuals
autoplot(model3_1)

5.1.1 Model 3 Analysis

Model 3 includes the same variables as model 2 (property_type, number of reviews, review_scores_rating, room_type) with 3 new added variables of bathrooms, bedrooms and beds. Adjusted R square improves largely from 38.3% to 47.7%. But it looks like beds has a low |t-value| < 2 and large p-value > 0.05. For later models, beds should be excluded. And after exclusion we get adjusted R square = 47.6%, about the same as before.

5.2 Model 4

Model 4 consists of the same variables of model 3 with the added variablehost_is_superhost specifying if the host is a super host.

# Model 4, includes property type, number of reviews, reviews ratings,room type, bedrooms, bathrooms, and is the host a super host
model4<-lm(price_4_nights_log ~
             prop_type_simplified +
             number_of_reviews +
             review_scores_rating +
             room_type + 
             bathrooms +
             bedrooms +
             host_is_superhost,
             data = extra_listings_log)

summary(model4)
## 
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews + 
##     review_scores_rating + room_type + bathrooms + bedrooms + 
##     host_is_superhost, data = extra_listings_log)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.144 -0.308 -0.035  0.273  4.730 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             5.749358   0.085402   67.32  < 2e-16
## prop_type_simplifiedCondominium         0.026704   0.028684    0.93  0.35197
## prop_type_simplifiedHouse              -0.030721   0.040007   -0.77  0.44262
## prop_type_simplifiedOther               0.312977   0.031784    9.85  < 2e-16
## prop_type_simplifiedServiced apartment  0.655642   0.054375   12.06  < 2e-16
## number_of_reviews                      -0.001354   0.000245   -5.53  3.5e-08
## review_scores_rating                    0.005094   0.000872    5.84  5.8e-09
## room_typeHotel room                    -0.787715   0.045633  -17.26  < 2e-16
## room_typePrivate room                  -0.641314   0.025932  -24.73  < 2e-16
## room_typeShared room                   -1.241512   0.054209  -22.90  < 2e-16
## bathrooms                              -0.018057   0.007760   -2.33  0.02005
## bedrooms                                0.270599   0.013695   19.76  < 2e-16
## host_is_superhostTRUE                   0.118067   0.030562    3.86  0.00011
##                                           
## (Intercept)                            ***
## prop_type_simplifiedCondominium           
## prop_type_simplifiedHouse                 
## prop_type_simplifiedOther              ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews                      ***
## review_scores_rating                   ***
## room_typeHotel room                    ***
## room_typePrivate room                  ***
## room_typeShared room                   ***
## bathrooms                              *  
## bedrooms                               ***
## host_is_superhostTRUE                  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.512 on 2402 degrees of freedom
##   (1239 observations deleted due to missingness)
## Multiple R-squared:  0.481,  Adjusted R-squared:  0.479 
## F-statistic:  186 on 12 and 2402 DF,  p-value: <2e-16
# Calculate Variance Inflation Factor
car::vif(model4)
##                      GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.92  4            1.08
## number_of_reviews    1.03  1            1.01
## review_scores_rating 1.10  1            1.05
## room_type            2.18  3            1.14
## bathrooms            1.34  1            1.16
## bedrooms             1.32  1            1.15
## host_is_superhost    1.09  1            1.04
# Check the residuals
autoplot(model4)

### Model 4 Analysis

Based on the regression, the fact of if a host is a super host has some effect on the price of the property. As shown from the table above, if the host is a super host (i.e. host_is_superhost = TRUE), the price would increase by 12.53% \(e^{0.1180666}-1\). The t-value and p-value are statistically significant at 95% level.

Model 4 has an adjusted R square of 47.86%, which means that the variables in model 4 explains only 47.86% of the variation in price_4_nights_log. Model 3 on the other hand has an adjusted R square of 47.60%. The different in adjusted r squares for the 2 models is only 0.26%. This tells us that the variable host_is_superhost might not be a good indicator for the variation in price of an Airbnb.

5.3 Model 5

Model 5 consists of the same variables of model 4 with the added variable is_location_exact on if the exact location of the Air BNB is given.

model5<-lm(price_4_nights_log ~
             prop_type_simplified +
             number_of_reviews +
             review_scores_rating +
             room_type +
             bathrooms +
             bedrooms +
             host_is_superhost +
             is_location_exact,
             data = extra_listings_log)


summary(model5)
## 
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews + 
##     review_scores_rating + room_type + bathrooms + bedrooms + 
##     host_is_superhost + is_location_exact, data = extra_listings_log)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.065 -0.305 -0.028  0.270  4.701 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             5.663481   0.088165   64.24  < 2e-16
## prop_type_simplifiedCondominium         0.024968   0.028610    0.87  0.38290
## prop_type_simplifiedHouse              -0.041084   0.039992   -1.03  0.30439
## prop_type_simplifiedOther               0.306420   0.031745    9.65  < 2e-16
## prop_type_simplifiedServiced apartment  0.651628   0.054237   12.01  < 2e-16
## number_of_reviews                      -0.001413   0.000245   -5.78  8.6e-09
## review_scores_rating                    0.005182   0.000870    5.96  2.9e-09
## room_typeHotel room                    -0.781930   0.045534  -17.17  < 2e-16
## room_typePrivate room                  -0.632975   0.025956  -24.39  < 2e-16
## room_typeShared room                   -1.232961   0.054108  -22.79  < 2e-16
## bathrooms                              -0.017484   0.007740   -2.26  0.02398
## bedrooms                                0.270130   0.013658   19.78  < 2e-16
## host_is_superhostTRUE                   0.116816   0.030480    3.83  0.00013
## is_location_exactTRUE                   0.097542   0.025884    3.77  0.00017
##                                           
## (Intercept)                            ***
## prop_type_simplifiedCondominium           
## prop_type_simplifiedHouse                 
## prop_type_simplifiedOther              ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews                      ***
## review_scores_rating                   ***
## room_typeHotel room                    ***
## room_typePrivate room                  ***
## room_typeShared room                   ***
## bathrooms                              *  
## bedrooms                               ***
## host_is_superhostTRUE                  ***
## is_location_exactTRUE                  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.51 on 2401 degrees of freedom
##   (1239 observations deleted due to missingness)
## Multiple R-squared:  0.484,  Adjusted R-squared:  0.481 
## F-statistic:  173 on 13 and 2401 DF,  p-value: <2e-16
# Calculate Variance Inflation Factor
car::vif(model5)
##                      GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.93  4            1.09
## number_of_reviews    1.03  1            1.01
## review_scores_rating 1.10  1            1.05
## room_type            2.20  3            1.14
## bathrooms            1.34  1            1.16
## bedrooms             1.32  1            1.15
## host_is_superhost    1.09  1            1.04
## is_location_exact    1.02  1            1.01
# Check the residuals
autoplot(model5)

### Model 5 Analysis

As with the super host variable in model 4 the fact if the exact location of the Air BNB is given seems insignificant. As shown from the table above, if the exact location is given, the price would increase by 10.25% (\(e^{0.0975425}-1\). The t-value and p-value are statistically significant at 95% level.

Model 5 has an adjusted R square of 48.14%, which means that the variables in model 5 explains only 48.14% of the variation in log price for 4 nights. Model 4 on the other hand has an adjusted R square of 47.86%. The difference in adjusted r squares for the 2 models is only 0.28%. This tells us that the variable on if the exact location of the Airbnb is given might not be a good indicator for the variation in price of an Airbnb.

5.4 Model 6

Based on the data set we have, we observed that the variable neighborhood_group_cleansed groups the neighborhoods in Singapore into 5 main areas: Central Region, East Region, West Region, North-East Region, North Region. While we could use these regions to account for the different neighborhoods in Singapore, we decided to regroup the neighborhoods in Singapore based on price per square feet. Our new variable, neighborhood_simplified, groups the neighborhoods in Singapore into 4 main regions, A,B,C and D. The neighborhoods in A consists of the areas with the highest price per square feet, which is mostly the areas around Central Singapore, while the price per square feet decreases in each group as they move away from Central Singapore

#count(distinct(extra_listings_log$neighbourhood_group_cleansed))
#skim(extra_listings_log)
# Create neighborhood_simplified 
extra_listings_log <- extra_listings_log %>%
  mutate(neighbourhood_simplified = case_when(
    neighbourhood_cleansed %in% c("Rochor", "Downtown Core", "River Valley","Singapore River","Orchard","Tanglin","Southern Islands","Museum","Marina South") ~ "A",
    neighbourhood_cleansed %in% c("Kallang", "Geylang","Outram", "Novena", "Bukit Merah","Queenstown","Bukit Timah","Toa Payoh","Marine Parade","Clementi","Newton","Bishan","Ang Mo Kio","Central Water Catchment") ~ "B", 
    neighbourhood_cleansed %in% c("Bedok","Jurong West","Woodlands","Hougang","Jurong East","Chua Chu Kang","Serangoon","Tampines","Bukit Batok","Sengkang","Pasir Ris","Yishun","Punggol","Bukit Panjang","Sembawang","Pioneer","Sungei Kadut","Tuas") ~ "C",
    TRUE ~ "D"))

After creating our own neighbourhood_simplified, we can map them and see how they are distributed

# Count each unique levels under neighbourhood_simplified
n_simplfied_count <- extra_listings_log %>%
  count(neighbourhood_simplified) %>%
  arrange(desc(n)) %>%
  mutate(total = sum(n)) %>%
  mutate(proportion = 100 * n/sum(n))
n_simplfied_count
neighbourhood_simplifiedntotalproportion
B2061365456.4 
A864365423.6 
C690365418.9 
D3936541.07
# Create a palette that maps prop_type_simplified factor levels to colors
pal3 <- colorFactor(c("navy", "red", "yellow", "black"), 
                    domain = c("B", "A", "C", "D"))

# Apply the palette to actual data
leaflet(data = extra_listings_log) %>% 
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude, 
    lat = ~latitude, 
    color = ~pal3(neighbourhood_simplified),
    radius = 3,
    stroke = FALSE, 
    fillOpacity = 0.5)

Now, let’s build a regression model 6, that’s model 5 including an additional variable neighbourhood_simplified.

extra_listings_log %>%
  count(neighbourhood_simplified) %>%
  arrange(desc(n)) %>% 
  kable()
neighbourhood_simplified n
B 2061
A 864
C 690
D 39
extra_listings_log %>%
  count(neighbourhood_group_cleansed) %>%
  arrange(desc(n)) %>% 
  kable()
neighbourhood_group_cleansed n
Central Region 2837
East Region 281
West Region 280
North-East Region 161
North Region 95
model6<-lm(price_4_nights_log ~
            prop_type_simplified +
             number_of_reviews +
             review_scores_rating +
             room_type +
             bathrooms +
             bedrooms +
             host_is_superhost +
             is_location_exact +
             neighbourhood_simplified,
             data = extra_listings_log)

summary(model6)
## 
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews + 
##     review_scores_rating + room_type + bathrooms + bedrooms + 
##     host_is_superhost + is_location_exact + neighbourhood_simplified, 
##     data = extra_listings_log)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.022 -0.288 -0.021  0.252  4.749 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             5.839150   0.089345   65.36  < 2e-16
## prop_type_simplifiedCondominium         0.026885   0.028198    0.95   0.3405
## prop_type_simplifiedHouse              -0.020663   0.039843   -0.52   0.6041
## prop_type_simplifiedOther               0.272795   0.031513    8.66  < 2e-16
## prop_type_simplifiedServiced apartment  0.644998   0.053537   12.05  < 2e-16
## number_of_reviews                      -0.001418   0.000241   -5.88  4.6e-09
## review_scores_rating                    0.005062   0.000858    5.90  4.1e-09
## room_typeHotel room                    -0.774654   0.044881  -17.26  < 2e-16
## room_typePrivate room                  -0.615885   0.026338  -23.38  < 2e-16
## room_typeShared room                   -1.241076   0.053474  -23.21  < 2e-16
## bathrooms                              -0.019421   0.007652   -2.54   0.0112
## bedrooms                                0.273705   0.013480   20.31  < 2e-16
## host_is_superhostTRUE                   0.117738   0.030136    3.91  9.6e-05
## is_location_exactTRUE                   0.083461   0.025784    3.24   0.0012
## neighbourhood_simplifiedB              -0.201694   0.025015   -8.06  1.2e-15
## neighbourhood_simplifiedC              -0.216492   0.034488   -6.28  4.1e-10
## neighbourhood_simplifiedD              -0.376206   0.094974   -3.96  7.7e-05
##                                           
## (Intercept)                            ***
## prop_type_simplifiedCondominium           
## prop_type_simplifiedHouse                 
## prop_type_simplifiedOther              ***
## prop_type_simplifiedServiced apartment ***
## number_of_reviews                      ***
## review_scores_rating                   ***
## room_typeHotel room                    ***
## room_typePrivate room                  ***
## room_typeShared room                   ***
## bathrooms                              *  
## bedrooms                               ***
## host_is_superhostTRUE                  ***
## is_location_exactTRUE                  ** 
## neighbourhood_simplifiedB              ***
## neighbourhood_simplifiedC              ***
## neighbourhood_simplifiedD              ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.503 on 2398 degrees of freedom
##   (1239 observations deleted due to missingness)
## Multiple R-squared:   0.5,   Adjusted R-squared:  0.497 
## F-statistic:  150 on 16 and 2398 DF,  p-value: <2e-16
# Calculate Variance Inflation Factor
car::vif(model6)
##                          GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified     2.03  4            1.09
## number_of_reviews        1.03  1            1.02
## review_scores_rating     1.10  1            1.05
## room_type                2.36  3            1.15
## bathrooms                1.35  1            1.16
## bedrooms                 1.32  1            1.15
## host_is_superhost        1.10  1            1.05
## is_location_exact        1.04  1            1.02
## neighbourhood_simplified 1.24  3            1.04
# Check the residuals
autoplot(model6)

5.4.1 Model 6 Analysis

Model 6 adds on to model 5 with the added variable of neighborhood_simplified which accounts for the different neighborhoods in Singapore.

The model uses neighborhood group A as the baseline and compares the other neighborhoods against A. As we expected, neighborhood A has the highest average price with the other neighborhoods being on average lower than group A. Neighborhood B for example of average has a price that is 18.27% (\(e^{-0.2016941}-1\) lower in price as compared to neighborhood A. Neighborhood C has an average price which is 19.47% (\(e^{-0.2164924}-1\) lower than neighborhood A. Again, as we expected, neighborhood C is lower than both neighborhood A and B.

Model 6 has an adjusted R square of 49.68%, which means that the variables in model 5 explains 49.68% of the variation in price_4_nights_log. Model 5 on the other hand has an adjusted R square of 48.14%. The different in adjusted r squares for the 2 models is 1.54%. This shows that the neighborhood_simplified adds marginal value to the model.

5.5 Model 7

model7<-lm(price_4_nights_log ~
            prop_type_simplified +
             number_of_reviews +
             review_scores_rating +
             room_type +
             bathrooms +
             bedrooms +
             host_is_superhost +
             is_location_exact +
             neighbourhood_simplified +
             cancellation_policy,
             data = extra_listings_log)

summary(model7)
## 
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews + 
##     review_scores_rating + room_type + bathrooms + bedrooms + 
##     host_is_superhost + is_location_exact + neighbourhood_simplified + 
##     cancellation_policy, data = extra_listings_log)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.963 -0.291 -0.030  0.254  4.695 
## 
## Coefficients:
##                                                 Estimate Std. Error t value
## (Intercept)                                     5.750028   0.093225   61.68
## prop_type_simplifiedCondominium                 0.026662   0.028090    0.95
## prop_type_simplifiedHouse                      -0.020389   0.039718   -0.51
## prop_type_simplifiedOther                       0.270925   0.031530    8.59
## prop_type_simplifiedServiced apartment          0.614170   0.053855   11.40
## number_of_reviews                              -0.001488   0.000242   -6.16
## review_scores_rating                            0.005363   0.000858    6.25
## room_typeHotel room                            -0.752566   0.045091  -16.69
## room_typePrivate room                          -0.602120   0.026550  -22.68
## room_typeShared room                           -1.207991   0.054070  -22.34
## bathrooms                                      -0.019900   0.007638   -2.61
## bedrooms                                        0.273979   0.013448   20.37
## host_is_superhostTRUE                           0.108227   0.030164    3.59
## is_location_exactTRUE                           0.079084   0.025708    3.08
## neighbourhood_simplifiedB                      -0.201191   0.025011   -8.04
## neighbourhood_simplifiedC                      -0.206334   0.034434   -5.99
## neighbourhood_simplifiedD                      -0.371903   0.094604   -3.93
## cancellation_policymoderate                    -0.001368   0.034392   -0.04
## cancellation_policystrict_14_with_grace_period  0.093647   0.029292    3.20
## cancellation_policysuper_strict_30              0.826958   0.505235    1.64
## cancellation_policysuper_strict_60              0.814680   0.504675    1.61
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## prop_type_simplifiedCondominium                 0.34264    
## prop_type_simplifiedHouse                       0.60776    
## prop_type_simplifiedOther                       < 2e-16 ***
## prop_type_simplifiedServiced apartment          < 2e-16 ***
## number_of_reviews                               8.7e-10 ***
## review_scores_rating                            4.7e-10 ***
## room_typeHotel room                             < 2e-16 ***
## room_typePrivate room                           < 2e-16 ***
## room_typeShared room                            < 2e-16 ***
## bathrooms                                       0.00923 ** 
## bedrooms                                        < 2e-16 ***
## host_is_superhostTRUE                           0.00034 ***
## is_location_exactTRUE                           0.00212 ** 
## neighbourhood_simplifiedB                       1.4e-15 ***
## neighbourhood_simplifiedC                       2.4e-09 ***
## neighbourhood_simplifiedD                       8.7e-05 ***
## cancellation_policymoderate                     0.96827    
## cancellation_policystrict_14_with_grace_period  0.00141 ** 
## cancellation_policysuper_strict_30              0.10181    
## cancellation_policysuper_strict_60              0.10660    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.501 on 2394 degrees of freedom
##   (1239 observations deleted due to missingness)
## Multiple R-squared:  0.505,  Adjusted R-squared:  0.501 
## F-statistic:  122 on 20 and 2394 DF,  p-value: <2e-16
# Calculate Variance Inflation Factor
car::vif(model7)
##                          GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified     2.10  4            1.10
## number_of_reviews        1.05  1            1.02
## review_scores_rating     1.11  1            1.05
## room_type                2.49  3            1.16
## bathrooms                1.36  1            1.17
## bedrooms                 1.33  1            1.15
## host_is_superhost        1.11  1            1.05
## is_location_exact        1.04  1            1.02
## neighbourhood_simplified 1.26  3            1.04
## cancellation_policy      1.18  4            1.02
# Check the residuals
autoplot(model7)

### Model 7 Analysis

Model 7 has achieved the best adjusted R square so far at 50.08%.

5.6 Model 8 for trial and error

# Add more variables to make the model a better predictor
model8<-lm(price_4_nights_log ~
            prop_type_simplified +
            number_of_reviews +
            review_scores_rating +
            room_type +
            bathrooms +
            bedrooms +
            host_is_superhost +
            is_location_exact +
            cancellation_policy,
            data = extra_listings_log)

summary(model8)
## 
## Call:
## lm(formula = price_4_nights_log ~ prop_type_simplified + number_of_reviews + 
##     review_scores_rating + room_type + bathrooms + bedrooms + 
##     host_is_superhost + is_location_exact + cancellation_policy, 
##     data = extra_listings_log)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.009 -0.301 -0.034  0.266  4.656 
## 
## Coefficients:
##                                                 Estimate Std. Error t value
## (Intercept)                                     5.585853   0.092528   60.37
## prop_type_simplifiedCondominium                 0.025250   0.028491    0.89
## prop_type_simplifiedHouse                      -0.037991   0.039860   -0.95
## prop_type_simplifiedOther                       0.305446   0.031734    9.63
## prop_type_simplifiedServiced apartment          0.616853   0.054556   11.31
## number_of_reviews                              -0.001474   0.000245   -6.01
## review_scores_rating                            0.005488   0.000870    6.31
## room_typeHotel room                            -0.758748   0.045740  -16.59
## room_typePrivate room                          -0.618329   0.026237  -23.57
## room_typeShared room                           -1.201610   0.054753  -21.95
## bathrooms                                      -0.017933   0.007724   -2.32
## bedrooms                                        0.270420   0.013621   19.85
## host_is_superhostTRUE                           0.108287   0.030487    3.55
## is_location_exactTRUE                           0.091348   0.025812    3.54
## cancellation_policymoderate                    -0.018761   0.034822   -0.54
## cancellation_policystrict_14_with_grace_period  0.083848   0.029619    2.83
## cancellation_policysuper_strict_30              0.968193   0.512438    1.89
## cancellation_policysuper_strict_60              0.953047   0.511848    1.86
##                                                Pr(>|t|)    
## (Intercept)                                     < 2e-16 ***
## prop_type_simplifiedCondominium                 0.37557    
## prop_type_simplifiedHouse                       0.34063    
## prop_type_simplifiedOther                       < 2e-16 ***
## prop_type_simplifiedServiced apartment          < 2e-16 ***
## number_of_reviews                               2.2e-09 ***
## review_scores_rating                            3.3e-10 ***
## room_typeHotel room                             < 2e-16 ***
## room_typePrivate room                           < 2e-16 ***
## room_typeShared room                            < 2e-16 ***
## bathrooms                                       0.02033 *  
## bedrooms                                        < 2e-16 ***
## host_is_superhostTRUE                           0.00039 ***
## is_location_exactTRUE                           0.00041 ***
## cancellation_policymoderate                     0.59010    
## cancellation_policystrict_14_with_grace_period  0.00468 ** 
## cancellation_policysuper_strict_30              0.05896 .  
## cancellation_policysuper_strict_60              0.06273 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.508 on 2397 degrees of freedom
##   (1239 observations deleted due to missingness)
## Multiple R-squared:  0.489,  Adjusted R-squared:  0.486 
## F-statistic:  135 on 17 and 2397 DF,  p-value: <2e-16

6 Compare coefficients across different models

huxreg("Model 1" = model1, "Model 2" =  model2, "Model 3" =  model3, "Model 4" =  model4, "Model 5" =  model5, "Model 6" = model6, "Model 7" = model7, statistics = c('#observations' = 'nobs', 
                                'R squared' = 'r.squared', 
                                'Adj. R Squared' = 'adj.r.squared', 
                                'Residual SE' = 'sigma'), 
                 bold_signif = 0.05, 
                 stars = NULL)
Model 1Model 2Model 3Model 4Model 5Model 6Model 7
(Intercept)5.746 6.298 5.705 5.749 5.663 5.839 5.750 
(0.101)(0.084)(0.084)(0.085)(0.088)(0.089)(0.093)
prop_type_simplifiedCondominium0.193 0.044 0.027 0.027 0.025 0.027 0.027 
(0.037)(0.031)(0.029)(0.029)(0.029)(0.028)(0.028)
prop_type_simplifiedHouse-0.217 -0.003 -0.019 -0.031 -0.041 -0.021 -0.020 
(0.052)(0.043)(0.040)(0.040)(0.040)(0.040)(0.040)
prop_type_simplifiedOther-0.146 0.285 0.317 0.313 0.306 0.273 0.271 
(0.034)(0.034)(0.032)(0.032)(0.032)(0.032)(0.032)
prop_type_simplifiedServiced apartment0.606 0.676 0.660 0.656 0.652 0.645 0.614 
(0.067)(0.058)(0.054)(0.054)(0.054)(0.054)(0.054)
number_of_reviews-0.001 -0.001 -0.001 -0.001 -0.001 -0.001 -0.001 
(0.000)(0.000)(0.000)(0.000)(0.000)(0.000)(0.000)
review_scores_rating0.005 0.004 0.006 0.005 0.005 0.005 0.005 
(0.001)(0.001)(0.001)(0.001)(0.001)(0.001)(0.001)
room_typeHotel room     -0.981 -0.781 -0.788 -0.782 -0.775 -0.753 
     (0.047)(0.046)(0.046)(0.046)(0.045)(0.045)
room_typePrivate room     -0.807 -0.648 -0.641 -0.633 -0.616 -0.602 
     (0.026)(0.026)(0.026)(0.026)(0.026)(0.027)
room_typeShared room     -1.425 -1.233 -1.242 -1.233 -1.241 -1.208 
     (0.057)(0.055)(0.054)(0.054)(0.053)(0.054)
bathrooms          -0.018 -0.018 -0.017 -0.019 -0.020 
          (0.008)(0.008)(0.008)(0.008)(0.008)
bedrooms          0.272 0.271 0.270 0.274 0.274 
          (0.014)(0.014)(0.014)(0.013)(0.013)
beds          -0.004                     
          (0.004)                    
host_is_superhostTRUE               0.118 0.117 0.118 0.108 
               (0.031)(0.030)(0.030)(0.030)
is_location_exactTRUE                    0.098 0.083 0.079 
                    (0.026)(0.026)(0.026)
neighbourhood_simplifiedB                         -0.202 -0.201 
                         (0.025)(0.025)
neighbourhood_simplifiedC                         -0.216 -0.206 
                         (0.034)(0.034)
neighbourhood_simplifiedD                         -0.376 -0.372 
                         (0.095)(0.095)
cancellation_policymoderate                              -0.001 
                              (0.034)
cancellation_policystrict_14_with_grace_period                              0.094 
                              (0.029)
cancellation_policysuper_strict_30                              0.827 
                              (0.505)
cancellation_policysuper_strict_60                              0.815 
                              (0.505)
#observations2433     2433     2417     2415     2415     2415     2415     
R squared0.083 0.391 0.480 0.481 0.484 0.500 0.505 
Adj. R Squared0.081 0.388 0.477 0.479 0.481 0.497 0.501 
Residual SE0.678 0.553 0.511 0.512 0.510 0.503 0.501 

7 Predicting the cost of an Air Bnb in Singapore

Based on all our models, we feel that model 7 is the model which would be the best predictor as it has the highest adjusted r square of 0.501. The equation for model 7 is the following:

price_4_nights_log = 5.75 + (0.0267 * prop_type_simplifiedCondominium) + (prop_type_simplifiedHouse * -0.0204) + (prop_type_simplifiedOther * 0.271) + (prop_type_simplifiedServiced apartment * 0.614) + (number_of_reviews * -0.00149) + (0.00536 * review_scores_rating) + (-0.753 * room_typeHotel room ) + (-0.602 * room_typePrivate room) + (-1.21 * room_typeShared room) + (bathrooms * -0.0199) + (bedrooms * 0.274) + (0.108 * host_is_superhostTRUE) + (is_location_exactTRUE * 0.0791) + (neighbourhood_simplifiedB * -0.201) + (neighbourhood_simplifiedC * -0.206) + (neighbourhood_simplifiedD * -0.372) (cancellation_policymoderate * -0.00137) + (cancellation_policystrict_14_with_grace_period * 0.0936) + (cancellation_policysuper_strict_30 * 0.827) + (cancellation_policysuper_strict_60 * 0.815)

We will use our model 7 to try and predict the price for 4 nights in Singapore where it is an apartment with a private room, have at least 10 reviews, and an average rating of at least 90. We added an extra variable of location being in neighborhood A, which is the central location in Singapore. Use your best model to predict the total cost to stay at this Airbnb for 4 nights. Include the appropriate 95% interval with your prediction. Report the point prediction and interval in terms of price_4_nights.

predicted_cost <- exp(5.75 + (0.0267 * 0) + 
                        (0 * -0.0204) + 
                        (0 * 0.271) + 
                        (1 * 0.614) +
                        (10 * -0.00149) +
                        (0.00536 * 90) +
                        (-0.753 * 0 ) +
                        (-0.602 * 1) +
                        (-1.21 * 0) +
                        (0 * -0.0199) +
                        (0 * 0.274) +
                        (0.108 * 0) +
                        (0 * 0.0791) +
                        (0 * -0.201) +
                        (0 * -0.206) +
                        (0 * -0.372) + 
                        (0 * -0.00137) +
                        (0 * 0.0936) +
                        (0 * 0.827) +
                        (0 * 0.815))

print(predicted_cost)
## [1] 508
SE_residual <- exp(0.501) # Calculate the Standard Error

upper_interval <- predicted_cost + (2*SE_residual)
lower_interval <- predicted_cost - (2*SE_residual)

specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k)) #decimal for the intervals

paste("Upper Interval is $",specify_decimal(upper_interval,2))
## [1] "Upper Interval is $ 510.80"
paste("Lower Interval is $",specify_decimal(lower_interval,2))
## [1] "Lower Interval is $ 504.20"

Based on the model we created, we found that the predicted cost of an Air Bnb in Singapore with the characteristics of it being for 4 nights, in an apartment with a private room, have at least 10 reviews with an average rating of 90 and be located in the central district of Singapore, to be $507.50. Based on our model, we are also 95% confident that the price will be between $504.20 and $510.80.